R Markdown

Note: if you downloaded this somewhere other than ~/Downloads, change paths appropriately

First, let’s set up our main dataset using data from flyatlas, Kondo et al. 2017, and Neme and Tautz 2014

#Atac-seq data: 
ovaryPeaks<-read.delim("~/Downloads/tissueSpecificityData/ovaryPeaks.narrowPeak", header=F)
testisPeaks<-read.delim("~/Downloads/tissueSpecificityData/testisPeaks.narrowPeak", header=F)
S2Peaks<-read.delim("~/Downloads/tissueSpecificityData/S2Peaks.narrowPeak", header=F)


#TF data from FLYNET
flynet<-read.delim("~/Downloads/tissueSpecificityData/tf_gene.txt")




dataset <-read.table("~/Downloads/tissueSpecificityData/190508_flyatlas_allgenes_alltissues_uniq_FPKM.txt")
dataset<-dataset %>% pivot_longer(-merged.ID, names_to="Tissue", values_to="FPKM")
names(dataset)<-c("gene", "Tissue","FPKM")
dataset$Tissue<-gsub(dataset$Tissue, pattern="ale", replacement="ale ")
dataset<-subset(dataset, !Tissue %in% c("Male Whole", "Female whole"))




dataset <-unique(dataset)

#colnames(dataset)<-gsub(x=colnames(dataset), pattern="merged.ID", replacement="gene")

#from:http://genesdev.cshlp.org/content/suppl/2017/10/19/31.18.1841.DC1/Supplemental_TableS1.xlsx
kondoages<-read.csv("~/Downloads/tissueSpecificityData/kondoages.csv")
kondoages<-kondoages[,c(1,7)]
names(kondoages)<-c("gene", "Age")

#from zhang ages http://gentree.ioz.ac.cn/download/dm6_ver78_age.tsv
Zhangages<-read.delim("~/Downloads/tissueSpecificityData/dm6_ver78_age.tsv")
names(Zhangages)<-c("gene", "transcript", "Age")



#Here I'm combining consensus gene ages from Neme-Tautz and Kondo, since they cover different periods.  Ambiguous genes are excluded.
kondoages<-join(kondoages, subset(Zhangages, Age== 0), by="gene", type="inner")
kondoages<-kondoages[,c(1,2)]
kondoages<-droplevels(subset(kondoages, Age== "Pan-drosophilid" | Age== "Bilateria" | Age=="Eukaryota" | Age== "Diptera" | Age=="Cellular_Organisms" | Age=="Diptera"))

Zhangages$Age<-as.character(Zhangages$Age)
mergedages<-rbind(kondoages, subset(Zhangages, Age != 0)[,c(1,3)])

dataset<-join(mergedages, dataset, by="gene")
dataset <-na.omit(dataset)

#now gotta log transform it
dataset$logFPKM<-log2(dataset$FPKM+1)
dataset[is.na(dataset)] <- 0


dataset<-mutate(dataset %>% group_by(gene), tau=tau(logFPKM))   



dataset$Age<-plyr::mapvalues(dataset$Age, from=c("6", "5", "4", "3", "2", "1", "Pan-drosophilid", "Diptera", "Bilateria", "Eukaryota", "Cellular_Organisms"), to=c("Drosophilid", "Drosophilid", "Drosophilid", "Drosophilid", "Drosophilid", "Drosophilid", "Pre-Drosophilid", "Pre-Drosophilid", "Pre-Bilateria", "Pre-Bilateria", "Pre-Bilateria"))
#for every gene, make a column with the tissue in which it is maximally expressed
dataset<-subset(dataset, Tissue !="Whole body")
flydata<-dataset
flydata$Tissue<-gsub(flydata$Tissue, pattern = "Spermatheca", replacement=" Spermatheca")
flydata$Tissue<-gsub(flydata$Tissue, pattern = "Male Testis", replacement="Testis")
flydata$Tissue<-gsub(flydata$Tissue, pattern = "Female Ovary", replacement="Ovary")
flydata<-mutate(flydata %>% group_by(gene), maxtissue=Tissue[which.max(FPKM)])

flydata$Age<-factor(flydata$Age, levels=c("Drosophilid", "Pre-Drosophilid", "Pre-Bilateria"))

Let’s make some figures! Here’s figure 1:

#figure 1A: 
#add medians
flydata2<- flydata %>% group_by(Age) %>%mutate(medtau=median(na.omit(tau)))

#uncomment the geom_text part to add medians to ggplot. it's really slow, so I don't recommend it.
p1<-ggplot(unique(flydata2[,c("gene", "Age", "tau", "medtau")]), aes(x=Age, y=tau)) +geom_violin(aes(fill=factor(Age)))+theme_classic()+
scale_fill_manual(values = viridis_pal(option = "viridis")(4))+stat_summary(
    aes( x=Age, y=tau),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", size=0.5
    )+ylab("Tau")+
  xlab(sprintf('Gene age group (Young \u2192 Old)'))+ theme(legend.position="none")+theme(axis.text=element_text(size=10))




rm(flydata2)
#let's now look at the proportion of genes with max expression in testis and ovary


proptestisovary<- flydata %>% group_by(Age,  maxtissue) %>% summarise(n=n()/25) %>% dplyr::mutate(proportion=n/sum(n), total=sum(n)) 


tmp<-data.frame()
for (i in 1:nrow(proptestisovary)){
  tmp2<-prop.test(proptestisovary$n[i], proptestisovary$total[i])$conf.int[c(1,2)]
tmp<-rbind(tmp, tmp2)
}




colnames(tmp)<-c("min", "max")
proptestisovary$min<-tmp$min
proptestisovary$max<-tmp$max


proptestisovary$maxtissue<-gsub(proptestisovary$maxtissue, pattern="Male Testis", replacement="Testis")
proptestisovary$maxtissue<-gsub(proptestisovary$maxtissue, pattern="Female Ovary", replacement="Ovary")


names(proptestisovary)[names(proptestisovary)=="maxtissue"]<-"Tissue"

flyothertissues<-subset(proptestisovary, Tissue!="Male Testis")



p2<-ggplot(subset(proptestisovary, Tissue %in% c("Testis", "Ovary", "Male Carcass", "Female Carcass")), aes(x=Age, y=proportion, group=Tissue, col=Tissue, fill=Tissue))+theme_classic()+
  geom_point(aes(fill=Tissue))+geom_line(size=1)+ylab("Proportions of \n genes  with maximum \n expression in tissue")+
  geom_errorbar(aes(ymin=min, ymax=max), width=0.08, size=0.7)+theme(axis.text=element_text(size=10), legend.text=element_text(size=10))+xlab("Gene age group")+scale_color_manual(values=viridis_pal(option = "viridis")(5))


#proportion of expressed genes


tissuetotals<-data.frame(table(unique(flydata[,c("gene", "Age")])$Age))
names(tissuetotals)<-c("Age", "n")

propexpressed<-data.frame(subset(flydata, FPKM>2)%>% group_by(Age, Tissue) %>% dplyr::summarise(total=n()))
propexpressed<-join(propexpressed, tissuetotals)
propexpressed$prop<-propexpressed$total/propexpressed$n

tmp<-data.frame()
for (i in c(1:nrow(propexpressed))){
  tmp2<-prop.test(propexpressed$total[i], propexpressed$n[i])$conf.int
tmp<-rbind(tmp, tmp2)
}

colnames(tmp)<-c("min", "max")
propexpressed$min<-tmp$min
propexpressed$max<-tmp$max


p3<-ggplot(subset(propexpressed, Tissue %in% c("Female Carcass", "Male Carcass", "Testis", "Ovary")), aes(x=Age, y=prop, group=Tissue, col=Tissue, fill=Tissue))+theme_classic()+
  geom_point(aes(fill=Tissue))+geom_line(size=1)+ylab("Proportion of \n genes expressed with \n FPKM>2 in tissue")+
  geom_errorbar(aes(ymin=min, ymax=max), width=0.08, size=0.7)+theme(axis.text=element_text(size=10), legend.text=element_text(size=10))+xlab("Gene age group")+scale_color_manual(values=viridis_pal(option = "viridis")(5))


plot_grid(p1, p2,p3, labels=c("A", "B","C"),  nrow=3)

#supplemental figure 1:
supp_figure1<-ggplot(proptestisovary, aes(x=Age, y=proportion, group=Tissue, col=Tissue, fill=Tissue))+theme_classic()+
  geom_point(aes(fill=Tissue))+geom_line()+ylab("Proportions of \n genes  with maximum \n expression in tissue")+
  geom_errorbar(aes(ymin=min, ymax=max), width=0.4)+facet_wrap(~Tissue)+theme(axis.text=element_text(size=10),legend.position='none', axis.text.x=element_text(angle=90))+xlab("Gene age group")

Figure 2: Expression for all tissues

#flydatascaled <-data.frame(flydata[,c(2,3)], t(apply(flydata[,c(4:30)],1,redist.fun)))
#flydatascaled <-data.frame(flydata[,c(2,3)],flydata[,c(4:30)])




res = flydata %>% group_by(Tissue) %>% 
  do(tidy(aov(logFPKM ~ Age, data=.)))


res2 = flydata %>% group_by(Tissue) %>%  
  do(tidy(TukeyHSD(aov(logFPKM ~ Age, data=.))))%>% 
  group_by(Tissue) %>% summarise(sum=sum(abs(estimate)))


stat.test <- flydata %>%
  group_by(Tissue) %>%
  wilcox_test(logFPKM ~ Age) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance() %>% add_y_position(fun ="max")
stat.test$y.position[]<-c(12,16,19)

#write.table(x=data.frame(stat.test[,c("Tissue", "group1","group2", "p", "p.adj")]), sep='\t',col.names=T, file =  "~/Box Sync/Evan/Zhao lab notebook/Writing/Tissue specificity paper/Supplemental_table_1.txt")
#is this the best way?


p1<-ggplot(flydata, aes(x=Age, y=logFPKM))+geom_boxplot(outlier.shape=NA,  width=0.5, aes(fill=Age, col=Age))+
  theme_classic()+facet_wrap(~Tissue)+ylab("Log2(FPKM+1)")+
   ylim(0,21)+
  theme(axis.text=element_text(size=10))+stat_summary(geom = "crossbar", width=0.65, fatten=0.2, color="white", fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) })+
  #scale_y_continuous(breaks=c(0,.25,0.5,.75,1),limits = c(0,1.5))+
  xlab("Gene age group")+stat_pvalue_manual(stat.test)+theme(axis.text.x=element_blank())+scale_fill_manual(values = viridis_pal(option = "viridis")(4))+scale_color_manual(values = viridis_pal(option = "viridis")(4))





res<-na.omit(res)
res<-join(res, res2)
#
#
#

####proposed replacement for figure 2B
#need to order largest to smallest
p2<-ggplot(res, aes(x = reorder(Tissue, -statistic), y=statistic))+geom_bar(stat="identity",fill =ifelse(res$Tissue=="Testis", "#440154FF", ifelse(res$Tissue=="Ovary", "#31688EFF","grey")))+theme_classic() + theme(legend.position='none', axis.text.x=element_text(angle=90), axis.title.x=element_blank())+scale_fill_hue(c=45, l=40)+ylab("ANOVA F statistic") + xlab("Tissue")+theme(axis.text=element_text(size=10), axis.title=element_text(size=10))


p3<-ggplot(res, aes(x = reorder(Tissue, -sum), y=sum))+geom_bar(stat="identity",fill =ifelse(res$Tissue=="Testis", "#440154FF", ifelse(res$Tissue=="Ovary", "#31688EFF","grey")))+theme_classic() + theme(legend.position='none', axis.text.x=element_text(angle=90), axis.title.x=element_blank())+scale_fill_hue(c=45, l=40)+ylab("Summed difference\nof group means") +theme(axis.text=element_text(size=10), axis.title=element_text(size=10))

p2<-plot_grid(p2, p3, nrow=1)
#
#
#
#

#p2<-ggplot(res, aes(x=sum, y=statistic, col=Tissue) )+ theme_classic()+theme(axis.text=element_text(size=10))+geom_point()+geom_text_repel(aes(label=Tissue ),nudge_y = -.05)+theme(legend.position="none", axis.title = element_text(size=10),axis.text=element_text(size=10))+xlab("Summed difference of group means")+ylab("Anova F-statistic")
  
plot_grid(p1, p2, nrow=2, labels=c("A", "B"), rel_widths = c(1.5,1), rel_heights=c(1.2,1))

Figure 3

flydatascaled<-mutate(flydata %>% group_by(gene), scaledExp=redist.fun(logFPKM))   




#flydatascaled <-data.frame(flydata[,c(2,3)], t(apply(flydata[,c(4:30)],1,redist.fun)))

connectivity<-data.frame()

genes<-unique(flydatascaled$gene)


#this is the code that makes connectivity.csv. this takes forever to run, so uncomment at your own risk
#for (i in genes[ genes %in% unique(flynet$FLY_TARGET_GENE)]){
#  tmp<-subset(flynet, FLY_TARGET_GENE== i )$FLY_TF_GENE
#  tmp<-subset(flydatascaled, gene %in% tmp)
#  tmp<-tmp %>% group_by(Tissue)%>% summarize(connectivity=sum(scaledExp))
# tmp$gene<-i
#  connectivity<-rbind(connectivity, unique(tmp))
#}


#write.csv(connectivity, "~/Downloads/tissueSpecificityData/connectivity.csv", row.names = F)

connectivity<-read.csv("~/Downloads/tissueSpecificityData/connectivity.csv")

connectivity<-join(connectivity, flydatascaled, c("gene", "Tissue"))
tmp<-gather(connectivity, key="Tissue", value="activity", -gene, -Age)
tmp<-na.omit(tmp)
#write.csv(tmp, "~/Box Sync/Evan/Zhao lab notebook/Writing/Tissue specificity paper/Supplemental_file3.csv")

#get correlation coefficients
connectivity %>% group_by(Tissue) %>% summarise(corr=cor(connectivity, logFPKM, method="spearman")) %>%print(n=100)
## # A tibble: 25 x 2
##    Tissue                     corr
##    <chr>                     <dbl>
##  1 Female Anal               0.492
##  2 Female Brain              0.519
##  3 Female Carcass            0.489
##  4 Female Crop               0.543
##  5 Female Eye                0.490
##  6 Female Head               0.431
##  7 Female Hindgut            0.516
##  8 Female Mated Spermatheca  0.542
##  9 Female Midgut             0.516
## 10 Female Salivary           0.601
## 11 Female Tubule             0.544
## 12 Female Virgin Spermatheca 0.545
## 13 Male Accessory            0.526
## 14 Male Anal                 0.527
## 15 Male Brain                0.525
## 16 Male Carcass              0.411
## 17 Male Crop                 0.527
## 18 Male Eye                  0.484
## 19 Male Head                 0.430
## 20 Male Hindgut              0.512
## 21 Male Midgut               0.500
## 22 Male Salivary             0.518
## 23 Male Tubule               0.503
## 24 Ovary                     0.667
## 25 Testis                    0.283
ks.test(subset(connectivity, Tissue=="Testis")$connectivity, subset(connectivity, Tissue=="Ovary")$connectivity)
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  subset(connectivity, Tissue == "Testis")$connectivity and subset(connectivity, Tissue == "Ovary")$connectivity
## D = 0.22545, p-value < 2.2e-16
## alternative hypothesis: two-sided
tmp4<-droplevels(subset(connectivity, Tissue=="Testis" | Tissue=="Ovary"))
stat.test1 <- compare_means(connectivity ~ Tissue, data=tmp4,group.by="Age" )
#plot of TF activity by gene age, testis and ovary
tmp4<-tmp4 %>% group_by(Age, Tissue)%>%mutate(med=median(connectivity))



p1<-ggplot(tmp4,aes(x= Age, y=connectivity ))+geom_violin(aes( fill=Tissue))+
       stat_pvalue_manual(data=stat.test1, label="p.adj= {format.pval(p.adj)}",y.position=c(32,32,32), x="Age")+theme_classic()+theme(axis.text=element_text(size=10))+
 xlab("Gene age group")+ylab("Relative transcription factor activity")+scale_fill_manual(values = viridis_pal()(4)[c(1,3)])+stat_summary(
    aes( x=Age, y=connectivity, group=Tissue),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", 
    )



#######expression vs TF activity, testis, ovary, brain
p2<-ggplot(tmp4, aes(x=connectivity, y=logFPKM, color=Tissue)   )+geom_point(alpha=0.05)+stat_smooth(aes(fill=Tissue, linetype=Tissue), method="loess",color="black", alpha=0.8)+
 # xlim(0,20)+
  theme_classic()+xlab("Activity of a genes TF \n partners in tissue")+ylab("Gene expression in tissue (Log2 FPKM)")+scale_fill_manual(values = viridis_pal()(4)[c(1,3)])+scale_color_manual(values = viridis_pal()(4)[c(1,3)])

p3<-ggplot(subset(connectivity, Tissue=="Male Brain" |Tissue=="Female Brain"), aes(x=connectivity, y=logFPKM, color=Tissue)   )+geom_point(alpha=0.05)+stat_smooth(aes(fill=Tissue, linetype=Tissue), method="loess", color="black", alpha=0.8)+
 # xlim(0,20)+
  theme_classic()+xlab("Activity of a genes TF \n partners in tissue")+ylab("Gene expression in tissue (Log2 FPKM)")+scale_fill_manual(values = viridis_pal()(4)[c(1,3)])+scale_color_manual(values = viridis_pal()(4)[c(1,3)])
ks.test(subset(connectivity, Tissue=="Male Brain")$connectivity, subset(connectivity, Tissue=="Female Brain")$connectivity)
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  subset(connectivity, Tissue == "Male Brain")$connectivity and subset(connectivity, Tissue == "Female Brain")$connectivity
## D = 0.030634, p-value = 0.0001698
## alternative hypothesis: two-sided
#put plots together nicely
p4<-plot_grid(p2, p3, labels = c("C", "D"),rel_widths = c(0.95,1))

#lets add age vs nTF




#interesting.  It looks like low connectivity allows expression in testis###########

##Figure with Age vs nTF, age vs nPP, age vs FPKM############

#read in TF data
DroTF<-read.delim("~/Box Sync/Evan/Zhao lab notebook/Misc data/DroID_v2018_08/tf_gene.txt")
DroTF<-data.frame(table(DroTF$FLY_TARGET_GENE))
names(DroTF)<-c("gene", "nInteractions")
DroTF<-join(DroTF, flydata)
DroTF<-na.omit(DroTF)



stat.test <- compare_means(nInteractions ~ Age,DroTF ) %>%
  mutate(y.position = c(41, 55, 60))


#must put fill in geom_boxplot for stats to work.  No idea why.

DroTF<-DroTF %>% group_by(Age) %>% mutate(medtf=median(nInteractions))
p5<-ggplot(DroTF, aes(x=Age, y=nInteractions))+geom_violin(aes(fill=Age))+stat_summary(
    aes( x=Age, y=nInteractions),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", 
    ) +
  stat_pvalue_manual(data = stat.test, label = "p.adj= {format.pval(p.adj)}",  xmin = "group1", xmax = "group2", y.position = c(55,60,65), tip.length = 0) +
  theme_classic()+theme(legend.position='none',axis.text=element_text(size=10))+xlab("Gene age")+ylab("Number of TFs")+scale_fill_manual(values = viridis_pal()(4))+ylim(0,65)

p6<-plot_grid(p1, p5, labels=c("A", "B"), nrow=1)
p7<-plot_grid(p6, p4, nrow=2, rel_widths = c(1.3,1))
p7

#ggsave(plot = p7, "~/Box Sync/Evan/Zhao lab notebook/Writing/Tissue specificity paper/2001009_Figure3.pdf", width=10, height=8)
testisPeaks$V1<-paste("chr", testisPeaks$V1, sep = "")
ovaryPeaks$V1<-paste("chr", ovaryPeaks$V1, sep = "")
S2Peaks$V1<-paste("chr", S2Peaks$V1, sep = "")

names(testisPeaks)<-c("Chr","Start", "End", "Name", "score", "strand", "signalValue", "pvalue", "qvalue", "peak")
names(ovaryPeaks)<-c("Chr","Start", "End", "Name", "score", "strand", "signalValue", "pvalue", "qvalue", "peak")
names(S2Peaks)<-c("Chr","Start", "End", "Name", "score", "strand", "signalValue", "pvalue", "qvalue", "peak")

annoData <- toGRanges(TxDb.Dmelanogaster.UCSC.dm6.ensGene)

testisPeaks <- toGRanges(testisPeaks, format="narrowPeak")
testisPeaks<-subset(testisPeaks, seqnames %in% c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))
testisPeaks<-keepSeqlevels(testisPeaks, value = c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))

ovaryPeaks <- toGRanges(ovaryPeaks, format="narrowPeak")
ovaryPeaks<-subset(ovaryPeaks, seqnames %in% c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))
ovaryPeaks<-keepSeqlevels(ovaryPeaks, value = c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))

S2Peaks <- toGRanges(S2Peaks, format="narrowPeak")
S2Peaks<-subset(S2Peaks, seqnames %in% c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))
S2Peaks<-keepSeqlevels(S2Peaks, value = c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))

annoData2<-subset(annoData, seqnames %in% c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))
annoData2<-keepSeqlevels(annoData2, value = c("chr2L", "chr2R", "chr3L", "chr3R", "chrX",  "chrY", "chr4"))


# p values >.05 (it's negative log transformed in the data: -log10(.05)=1.3)
testisPeaks<-subset(testisPeaks, qvalue>1.3)
ovaryPeaks<-subset(ovaryPeaks, qvalue>1.3)
S2Peaks<-subset(S2Peaks, qvalue>1.3)


#testisAnno <- annotatePeakInBatch(testisPeaks, AnnotationData=annoData2, output ="overlapping",
#                                   FeatureLocForDistance="TSS",select = "all",
#                                   bindingRegion=c(-2000, 100))
#nTestisPeak<-data.frame(table((testisAnno$feature)))

testisAnno <- annotatePeakInBatch(testisPeaks, AnnotationData=annoData2, output ="overlapping",
                                   FeatureLocForDistance="TSS",select = "first",
                                   bindingRegion=c(-2000, 100))

#ovaryAnno <- annotatePeakInBatch(ovaryPeaks, AnnotationData=annoData2, output ="overlapping",
#                                   FeatureLocForDistance="TSS",select = "all",
#                                   bindingRegion=c(-2000, 100))
#nOvaryPeak<-data.frame(table(ovaryAnno$feature))

ovaryAnno <- annotatePeakInBatch(ovaryPeaks, AnnotationData=annoData2, output ="overlapping",
                                   FeatureLocForDistance="TSS",select = "first",
                                   bindingRegion=c(-2000, 100))




#names(nOvaryPeak)<-c("gene", "nPeaks")
#nOvaryPeak$Tissue<-"Ovary"
#names(nTestisPeak)<-c("gene", "nPeaks")
#nTestisPeak$Tissue<-"Testis"
#npeaks<-rbind(nOvaryPeak, nTestisPeak)
#npeaks<-join(npeaks, flydata[,c("Age", "gene")])
#npeaks<-na.omit(npeaks)
#ggplot(npeaks, aes(x=Age, y=nPeaks, fill=Tissue))+geom_violin()+theme_classic()+stat_compare_means(label="p.signif")

#stat.test <- npeaks %>%
#  group_by(Tissue) %>%
#  wilcox_test(nPeaks ~ Age) %>%
#  adjust_pvalue(method = "bonferroni") %>%
#  add_significance()+add_y_position()
#stat.test$y.position<-c(4,5,6,4,5,6)

#ggplot(npeaks, aes(x=Age, y=nPeaks, fill=Tissue))+geom_violin()+theme_classic()+facet_wrap(~Tissue)+stat_pvalue_manual(stat.test)


S2Anno <- annotatePeakInBatch(S2Peaks, AnnotationData=annoData2, output ="overlapping",
                                  FeatureLocForDistance="TSS",select = "first",
                                  bindingRegion=c(-2000, 100))




testisAnno<-data.frame(gene=unique(testisAnno$feature))
testisAnno$Tissue<-"Testis"
testisAnno$isPeak<-"Yes"
testisAnno<-join(testisAnno,flydata,by=c("gene", "Tissue"), type="left")
testisAnno<-na.omit(testisAnno)

ispeak<-join(data.frame(subset(flydata,Tissue=="Testis")), data.frame(gene=testisAnno$gene, isPeak=testisAnno$isPeak))
ispeak$isPeak<-ifelse(is.na(ispeak$isPeak), "No", "Yes" )

tissuepeaks<-ispeak

ovaryAnno<-data.frame(gene=unique(ovaryAnno$feature))
ovaryAnno$Tissue<-"Ovary"
ovaryAnno$isPeak<-"Yes"
ovaryAnno<-join(ovaryAnno,flydata,by=c("gene", "Tissue"))
ovaryAnno<-na.omit(ovaryAnno)
ispeak<-join(data.frame(subset(flydata,Tissue=="Ovary")), data.frame(gene=ovaryAnno$gene, isPeak=ovaryAnno$isPeak))
ispeak$isPeak<-ifelse(is.na(ispeak$isPeak), "No", "Yes" )

tissuepeaks<-rbind(tissuepeaks, ispeak)




S2Anno<-data.frame(gene=unique(S2Anno$feature))
S2Anno$Tissue<-"Male Carcass"
S2Anno$isPeak<-"Yes"
S2Anno<-join(S2Anno,flydata,by=c("gene", "Tissue"))
S2Anno<-na.omit(S2Anno)

ispeak<-join(data.frame(subset(flydata,Tissue=="Male Carcass")), data.frame(gene=S2Anno$gene, isPeak=S2Anno$isPeak))
ispeak$isPeak<-ifelse(is.na(ispeak$isPeak), "No", "Yes" )

tissuepeaks<-rbind(tissuepeaks, ispeak)
tissuepeaks<-unique(tissuepeaks)



nAge<-data.frame(age=table(unique(flydata[,c("Age", "gene")])$Age))
colnames(nAge)<-c("Age", "totalAge")

nTestis<-data.frame(table(subset(tissuepeaks, Tissue=="Testis" & isPeak=="Yes")$Age))
colnames(nTestis)<-c("Age", "totalTissue")
nTestis<-join(nTestis, nAge)
nTestis$prop<-nTestis$totalTissue/nTestis$totalAge
nTestis$Tissue<-"Testis"





nOvary<-data.frame(table(subset(tissuepeaks, Tissue=="Ovary" & isPeak=="Yes")$Age))
colnames(nOvary)<-c("Age", "totalTissue")
nOvary<-join(nOvary, nAge)
nOvary$prop<-nOvary$totalTissue/nOvary$totalAge
nOvary$Tissue<-"Ovary"

nS2<-data.frame(table(subset(tissuepeaks, Tissue=="Male Carcass" & isPeak=="Yes")$Age))
colnames(nS2)<-c("Age", "totalTissue")
nS2<-join(nS2, nAge)
nS2$prop<-nS2$totalTissue/nS2$totalAge
nS2$Tissue<-"S2"

#get proportion of genes with peaks in each tissue
propTissue<-rbind(nTestis, nOvary, nS2)

#get confidence intervals for proportions
tmp<-data.frame()
for (i in c(1:9)){
  tmp2<-prop.test(propTissue$totalTissue[i], propTissue$totalAge[i])$conf.int
tmp<-rbind(tmp, tmp2)
}

colnames(tmp)<-c("min", "max")
propTissue$min<-tmp$min
propTissue$max<-tmp$max

p1<-ggplot(propTissue, aes(x=Age, y=prop, group=Tissue, col=Tissue, fill=Tissue))+theme_classic()+theme(axis.text=element_text(size=10), legend.text=element_text(size=10))+
  geom_point(aes(fill=Tissue))+geom_line(size=1)+ylab("Proportions of genes\n with ATAC peaks in promoters")+
  geom_errorbar(aes(ymin=min, ymax=max), width=0.2, size=0.7)+ylim(0,1)+scale_color_manual(values = viridis_pal()(4))


#now make a figure showing that having a peak correlates with expression: Testis, ovary, wholebody

tissuepeaks<-tissuepeaks %>% group_by(Tissue, isPeak) %>%mutate(med=median(FPKM))

p2<-ggplot(subset(tissuepeaks, Tissue=="Testis"), aes(x= isPeak, y=FPKM,fill=isPeak))+geom_violin(scale="width")+stat_compare_means(label= "p.signif", label.x.npc=0.4, label.y = 48)+ylab("Testis FPKM")+theme_classic()+labs(fill="Testis ATAC peak")+theme(axis.text=element_text(size=10),legend.position='none', axis.title=element_text(size=10))+xlab("Does gene have ATAC-seq \n peak in testis?")+ylim(0,50)+stat_summary(
    aes( x=isPeak, y=FPKM),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", 
    )+scale_fill_manual(values = viridis_pal()(4))

p3<-ggplot(subset(tissuepeaks, Tissue=="Ovary"), aes(x= isPeak, y=FPKM, fill=isPeak))+geom_violin(scale="width")+stat_compare_means(label= "p.signif", label.x.npc=0.4, label.y=48)+ylab("Ovary FPKM")+theme_classic()+labs(fill="Ovary ATAC peak")+theme(axis.text=element_text(size=10),legend.position='none', axis.title=element_text(size=10))+xlab("Does gene have ATAC-seq \n peak in ovary?")+ylim(0,50)+stat_summary(
    aes( x=isPeak, y=FPKM),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", 
    )+scale_fill_manual(values = viridis_pal()(4))


p4<-ggplot(subset(tissuepeaks, Tissue=="Male Carcass"), aes(x= isPeak, y=FPKM, fill=isPeak))+geom_violin(scale="width")+stat_compare_means(label= "p.signif", label.x.npc=0.4, label.y=48)+ylab("Male Carcass FPKM")+theme_classic()+labs(fill="S2 ATAC peak")+theme(axis.text=element_text(size=10), legend.position='none',axis.title=element_text(size=10))+xlab("Does gene have ATAC-seq \n peak in S2 cells?")+ylim(0,50)+stat_summary(
    aes( x=isPeak, y=FPKM),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", 
    )+scale_fill_manual(values = viridis_pal()(4))


p5<-plot_grid(p2, p3, p4, nrow=1, labels=c("B", "C", "D"))
p6<-plot_grid(p1, p5, nrow=2, labels=c("A", ""))


tmp4<-join(tmp4, tissuepeaks[,c("gene", "Tissue", "isPeak")])
tmp4<-tmp4 %>%group_by(Tissue, isPeak) %>%mutate(med=median(connectivity))
p7<-ggplot(subset(tmp4, Tissue=="Testis"), aes(x=isPeak, y=connectivity, fill=isPeak ))+geom_violin(scale="width")+theme_classic()+stat_compare_means(method = "wilcox", label = "p.signif", label.x.npc = 0.4, label.y=20)+ylab("TF activity in testis") + xlab("Does gene have promoter ATAC-seq \npeak in testis?")+theme(legend.position='none')+stat_summary(
    aes( x=isPeak, y=connectivity),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", 
    )+scale_fill_manual(values = viridis_pal()(4))

p8<-ggplot(subset(tmp4, Tissue=="Ovary"), aes(x=isPeak, y=connectivity, fill=isPeak ))+geom_violin(scale="width")+theme_classic()+stat_compare_means(method = "wilcox", label = "p.signif", label.x.npc = 0.4, label.y=28)+ylab("TF activity in ovary") + xlab("Does gene have promoter ATAC-seq \npeak in ovary?")+theme(legend.position='none')+stat_summary(
    aes( x=isPeak, y=connectivity),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", 
    )+scale_fill_manual(values = viridis_pal()(4))
p9<-plot_grid(p7, p8, labels=c("E","F"))
########figure 4
plot_grid(p6,p9, nrow=2, rel_heights=c(2.2,1))

#ggsave("~/Box Sync/Evan/Zhao lab notebook/Writing/Tissue specificity paper/200910_figure4.pdf",width=9.5, height=11)


############proportion of shared peaks

testisovaryshared<-unique(join(testisAnno[,c("gene", "Age")], ovaryAnno[,c("gene", "Age")], type="inner"))
nTestisOvaryShared<-data.frame(table(testisovaryshared$Age))
colnames(nTestisOvaryShared)<-c("Age", "totalTissue")
nTestisOvaryShared<-join(nTestisOvaryShared, nAge)
nTestisOvaryShared$prop<-nTestisOvaryShared$totalTissue/nTestisOvaryShared$totalAge
nTestisOvaryShared$Tissue<-"Testis Ovary shared"


#expression vs ispeak, low TF activity, high TF activity
tmp4<-unique(join(connectivity, tissuepeaks))
tmp4<-subset(tmp4, Tissue=="Ovary" |Tissue=="Testis")

#median connectivity for each tissue
tmp4 %>%  group_by(Tissue)%>%summarise(med=median(connectivity))
## # A tibble: 2 x 2
##   Tissue   med
##   <chr>  <dbl>
## 1 Ovary   8.97
## 2 Testis  6.49
#ovary: 8.97, testis, 6.49
tmp5<-subset(tmp4, Tissue=="Ovary")
tmp5$connectivitybin<-ifelse(tmp5$connectivity<median(tmp5$connectivity), "Low", "High")
tmp6<-subset(tmp4, Tissue=="Testis")
tmp6$connectivitybin<-ifelse(tmp6$connectivity<median(tmp6$connectivity), "Low", "High")

tmp4<-rbind(tmp5, tmp6)
tmp4$connectivitybin<-factor(tmp4$connectivitybin, levels=c("Low", "High"))



tmp4 %>% group_by(Tissue, connectivitybin) %>% wilcox_test(logFPKM ~isPeak) %>% adjust_pvalue() 
## # A tibble: 4 x 10
##   Tissue connectivitybin .y.   group1 group2    n1    n2 statistic        p
##   <chr>  <fct>           <chr> <chr>  <chr>  <int> <int>     <dbl>    <dbl>
## 1 Ovary  Low             logF… No     Yes     3275  1719  1896010. 3.57e-82
## 2 Ovary  High            logF… No     Yes     1020  3974  1829472  1.57e- 6
## 3 Testis Low             logF… No     Yes     1773  3220  2406600  3.91e-20
## 4 Testis High            logF… No     Yes      352  4643   728158  6.44e- 4
## # … with 1 more variable: p.adj <dbl>
stat1<-tmp4 %>% group_by(Tissue, connectivitybin) %>% summarise(meds=median(FPKM))
stat2<-tmp4 %>% group_by(Tissue, isPeak) %>% summarise(meds=median(FPKM))


#Figure 5

#get medians for table 1, figure 5
tmp4<-tmp4 %>% group_by(Tissue, connectivitybin, isPeak)%>%mutate(med=median(FPKM))





ggplot(tmp4, aes(x=isPeak, y=FPKM,fill=connectivitybin))+geom_violin(scale="width")+
    stat_compare_means(aes(group=connectivitybin),label="p.signif")+facet_wrap(~Tissue)+theme_classic()+labs(color="",fill="TF activity", y="FPKM", x="Does gene have detectable open chromatin in its promoter?")+ylim(0,100)+scale_fill_manual(values = viridis_pal(option = "viridis")(4))+scale_color_manual(values=c("black","black"))+theme(axis.text=element_text(size=10), legend.text=element_text(size=10), axis.title = element_text(size=10))+
  stat_summary(
    aes(col = connectivitybin, x=isPeak, y=FPKM),
    fun.data="median_IQR", position=position_dodge(width=0.9), col="white", size=0.5
    )

supp_figure1

p1<-ggplot(subset(tmp4, Tissue=="Testis"), aes(x=isPeak, y=logFPKM, fill=connectivitybin) )+geom_boxplot(outlier.shape=NA)+stat_compare_means(aes(x=isPeak, y=logFPKM, fill=connectivitybin), method="wilcox")+theme_classic()+labs(fill="TF activity", x="Does gene have\n ATAC peak in testis?")
p2<-ggplot(subset(tmp4, Tissue=="Ovary"), aes(x=isPeak, y=logFPKM, fill=connectivitybin) )+geom_boxplot(outlier.shape=NA)+stat_compare_means(aes(x=isPeak, y=logFPKM, fill=connectivitybin), method="wilcox")+theme_classic()+labs(fill="TF activity",x="Does gene have\n ATAC peak in ovary?")
supp_figure_2<-plot_grid(p1, p2, labels=c("A", "B"))
supp_figure_2

##########
#########
########
########
#get supplemental figure with TF activity versus FPKM for every tissue.
supp_figure_3<-ggplot(connectivity, aes(x=connectivity, y=logFPKM, color=Tissue) )+stat_cor( label.y = 10, color="black")+stat_smooth(aes(fill=Tissue), method="loess", color="black", alpha=0.8)+facet_wrap(~Tissue)+theme_classic()+xlab("Activity of a genes TF \n partners in tissue")+ylab("Gene expression in tissue (Log2 FPKM)")+theme(legend.position='none')+ylim(0,15)
supp_figure_3

propexpressed$Tissue<-gsub(propexpressed$Tissue, pattern="Sperm", replacement=" Sperm")
#get proportion of expressed genes, by age
supp_figure4<-ggplot(propexpressed, aes(x=Age, y=prop, group=Tissue, col=Tissue, fill=Tissue))+theme_classic()+
  geom_point(aes(fill=Tissue))+geom_line()+ylab("Proportions of \n genes  with FPKM>2\n in tissue")+
  geom_errorbar(aes(ymin=min, ymax=max),width=0.4)+facet_wrap(~Tissue)+theme(axis.text=element_text(size=10),legend.position='none', axis.text.x=element_text(angle=90))+xlab("Gene age group (Young -> Old)")

supp_figure4